First load the required libraries

# Remove all objects returned by ls() from the current (global) environment so
# the analysis starts from an empty workspace.
# NOTE(review): when this file is sourced from an interactive session this also
# deletes the caller's own objects; running in a fresh R session gives the same
# clean state without that side effect.
rm(list = ls())

library(vtable)
library(scales)
library(corrplot)
library(nortest)
library(randomForest)
library(foreign)
library(GGally)
library(haven)
library(magrittr)
library(data.table)
library(dplyr)
library(plyr)
library(nycflights13)
library(tidyverse)
library(datasets)
library(readxl)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(plyr)
library(factoextra)
library(readxl)
library(plotly)
library(naivebayes)
library(caTools)
library(devtools)
library(ggcorrplot)
library(usethis)
library(fastDummies)
library(recipes)
library(caretEnsemble)
library(readr)
library("gplots")
library(dominanceanalysis)
library(caTools)
library(randomForest) 
library(xgboost) 
library(data.table)
library(plyr)
library(nycflights13)
library(datasets)
library(readxl)
library(magrittr)
library(maps)
library(plotly)
library(plyr)
library(GGally)
library(readxl)
library(plotly)
library(graphics)
library(e1071)
library(caTools)
library(ggplot2)
library(caret)
library(caretEnsemble)
library(psych)
library(GGally)
library(rpart)
library(randomForest)
library(readr)
library(vtable)
library(scales)
library(gridExtra)
library(corrplot)
library(nortest)
library(class)
library(randomForest)
library(foreign)
library(foreign)
library(GGally)
library(data.table)
library(plyr)
library(ggmap)
library(nycflights13)
library(datasets)
library(readxl)
library(DataExplorer)
library(maps)
library(plotly)
library(plyr)
library(GGally)
library(readxl)
library(plotly)
library(mice)
library(caTools)
library(lattice)
library(ggcorrplot)
library(usethis)
library(fastDummies)
library(recipes)
library(GGally)
library(caretEnsemble)
library(Amelia)
library(GGally)
library(randomForest)
library(readr)
library(aod)
library("gplots")
library(caret)
library(dominanceanalysis)
library(caTools)
library(randomForest) # for fitting RFs
library(skimr)
library(GGally)
library(plotly)
library(viridis)
library(caret)
library(randomForest)
library(rpart.plot)
library(corrgram)
library(h2o)
library(ggthemes)
library(treemap)
library(treemapify)
library(repr)
library(cowplot)
library(magrittr)
library(ggpubr)
library(RColorBrewer)
library(plotrix)
library(ggrepel)
library(forcats)
library(reshape2)
library(caTools)
library(tree)
library(rattle)
library(foreign)
library(haven)
library(ggplot2)
library(foreign)
library(ggplot2)
library(GGally)
library(haven)
library(magrittr)
library(data.table)
library(dplyr)
library(plyr)
library(dplyr)
library(factoextra)
library(ggplot2)
library(ggmap)
library(nycflights13)
library(tidyverse)
library(datasets)
library(readxl)
library(tidyverse) 
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(plyr)
library(gridExtra)
library(factoextra)
library(GGally)
library(readxl)
library(tidyverse) 
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(gridExtra)
library(factoextra)
library(GGally)
library(gridExtra)
library(graphics)
library(mice)
library(naivebayes)
library(e1071)
library(caTools)
library(lattice)
library(ggplot2)
library(tidyverse)
library(caret)
library(caretEnsemble)
library(psych)
library(Amelia)
library(mice)
library(GGally)
library(rpart)
library(randomForest)
library(scales)
library(readr)
# Session-wide settings: plot size used by the `repr` options (8 x 6) and
# warn = -1, which silences every warning for the rest of the session.
options(
  repr.plot.width = 8,
  repr.plot.height = 6,
  warn = -1
)

Let's import the data sets

# Main case-study data set (strings are left as character columns here).
df <- read.csv(
  file = "/Users/owner/Desktop/homework/unit14,15(case sudy)/CaseStudy2-data.csv"
)

# Competition set; per its file name it presumably lacks the Attrition
# column -- verify against the file. Strings are read as factors.
cs2.NoAttrition <- read.csv(
  file = "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Attrition.csv",
  stringsAsFactors = TRUE
)

# Competition set; per its file name it presumably lacks the salary
# column -- verify against the file. Strings are read as factors.
cs2.NoSalary <- read.csv(
  file = "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Salary.csv",
  stringsAsFactors = TRUE
)

# Quantify the missing data: first the total number of NA cells in df,
# then that count as a percentage of all cells (rows x columns).

print(sum(is.na(df)))
## [1] 0
print((sum(is.na(df)) / prod(dim(df))) * 100)
## [1] 0

# ================ Quick look at the data and data processing ========

# Number of employees per job role, most common role first.
# The dplyr verbs are namespace-qualified on purpose: this script attaches
# plyr after dplyr, and plyr exports its own summarise(), arrange() and
# desc(), which would otherwise be picked up and ignore the grouping.
df %>%
  dplyr::group_by(JobRole) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::arrange(dplyr::desc(n))
## # A tibble: 9 × 2
##   JobRole                       n
##   <chr>                     <int>
## 1 Sales Executive             200
## 2 Research Scientist          172
## 3 Laboratory Technician       153
## 4 Manufacturing Director       87
## 5 Healthcare Representative    76
## 6 Sales Representative         53
## 7 Manager                      51
## 8 Research Director            51
## 9 Human Resources              27
# Human-readable label for each Education code; the position in the vector
# is the code (1 = "Without College D." ... 5 = "Phd D.").
education_labels <- c(
  "Without College D.",
  "College D.",
  "Bachelors D.",
  "Masters D.",
  "Phd D."
)

# Look the label up by code. Codes outside 1-5 (and NA) become NA instead of
# being silently labelled "Phd D." as a nested-ifelse catch-all would do.
df$Educational_Levels <-
  education_labels[match(df$Education, seq_along(education_labels))]

# Summary-statistics table for every column of df.
st(df)
Summary Statistics
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
ID 870 435.5 251.292 1 218.25 652.75 870
Age 870 36.829 8.926 18 30 43 60
Attrition 870
… No 730 83.9%
… Yes 140 16.1%
BusinessTravel 870
… Non-Travel 94 10.8%
… Travel_Frequently 158 18.2%
… Travel_Rarely 618 71%
DailyRate 870 815.228 401.116 103 472.5 1165.75 1499
Department 870
… Human Resources 35 4%
… Research & Development 562 64.6%
… Sales 273 31.4%
DistanceFromHome 870 9.339 8.137 1 2 14 29
Education 870 2.901 1.023 1 2 4 5
EducationField 870
… Human Resources 15 1.7%
… Life Sciences 358 41.1%
… Marketing 100 11.5%
… Medical 270 31%
… Other 52 6%
… Technical Degree 75 8.6%
EmployeeCount 870 1 0 1 1 1 1
EmployeeNumber 870 1029.832 604.789 1 477.25 1561.5 2064
EnvironmentSatisfaction 870 2.701 1.099 1 2 4 4
Gender 870
… Female 354 40.7%
… Male 516 59.3%
HourlyRate 870 65.614 20.127 30 48 83 100
JobInvolvement 870 2.723 0.704 1 2 3 4
JobLevel 870 2.039 1.09 1 1 3 5
JobSatisfaction 870 2.709 1.115 1 2 4 4
MaritalStatus 870
… Divorced 191 22%
… Married 410 47.1%
… Single 269 30.9%
MonthlyIncome 870 6390.264 4597.696 1081 2839.5 8182 19999
MonthlyRate 870 14325.621 7108.382 2094 8092 20456.25 26997
NumCompaniesWorked 870 2.728 2.52 0 1 4 9
Over18 870
… Y 870 100%
OverTime 870
… No 618 71%
… Yes 252 29%
PercentSalaryHike 870 15.2 3.675 11 12 18 25
PerformanceRating 870 3.152 0.359 3 3 3 4
RelationshipSatisfaction 870 2.707 1.102 1 2 4 4
StandardHours 870 80 0 80 80 80 80
StockOptionLevel 870 0.784 0.858 0 0 1 3
TotalWorkingYears 870 11.053 7.514 0 6 15 40
TrainingTimesLastYear 870 2.832 1.273 0 2 3 6
WorkLifeBalance 870 2.782 0.712 1 2 3 4
YearsAtCompany 870 6.962 6.021 0 3 10 40
YearsInCurrentRole 870 4.205 3.639 0 2 7 18
YearsSinceLastPromotion 870 2.169 3.186 0 0 3 15
YearsWithCurrManager 870 4.14 3.574 0 2 7 17
Educational_Levels 870
… Bachelors D. 324 37.2%
… College D. 182 20.9%
… Masters D. 240 27.6%
… Phd D. 26 3%
… Without College D. 98 11.3%

Define the type of variables

# Columns that are treated as categorical in the analysis. The same set is
# converted in the training data and in both competition sets.
factor_cols <- c(
  "WorkLifeBalance", "JobRole", "JobInvolvement", "JobSatisfaction",
  "JobLevel", "TrainingTimesLastYear", "PerformanceRating",
  "StockOptionLevel", "RelationshipSatisfaction", "Education",
  "EnvironmentSatisfaction", "BusinessTravel"
)

# Convert the columns named in `cols` to factors and return the data frame.
# Column order and all other columns are left untouched; a name in `cols`
# that is not a column of `data` raises an error.
convert_to_factors <- function(data, cols) {
  data[cols] <- lapply(data[cols], as.factor)
  data
}

# Training data: the categorical columns plus the Attrition response.
df <- convert_to_factors(df, c(factor_cols, "Attrition"))
table(df$Attrition)
## 
##  No Yes 
## 730 140

# Competition set without the Attrition column.
cs2.NoAttrition <- convert_to_factors(cs2.NoAttrition, factor_cols)

# Competition set that still carries Attrition.
cs2.NoSalary <- convert_to_factors(cs2.NoSalary, c(factor_cols, "Attrition"))

# Split df by attrition status. which() drops rows whose comparison is NA,
# so rows with a missing Attrition value end up in neither subset.
Attrition.Yes <- df[which(df$Attrition == "Yes"), , drop = FALSE]
Attrition.No <- df[which(df$Attrition == "No"), , drop = FALSE]

Plot data types

# Tabulate how many columns of df have each class, print the table, and
# draw it as a bar chart.
d1 <- as.data.frame(table(sapply(df, class)))
print(d1)
##        Var1 Freq
## 1 character    7
## 2    factor   13
## 3   integer   17
ggplot(data = d1, mapping = aes(x = Var1, y = Freq)) +
  geom_col(colour = "blue", fill = "lightblue") +
  labs(
    x = "Type of Class",
    y = "Frequency",
    title = "Column type Frequency"
  ) +
  theme_bw()

# Summary statistics of df reported separately for each Attrition group,
# with the groups stacked vertically (long layout).
vtable::st(data = df, group = "Attrition", group.long = TRUE)
Summary Statistics
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
Attrition: No
ID 730 430.301 251.324 1 213.25 645.25 870
Age 730 37.412 8.673 18 31 43 60
BusinessTravel 730
… Non-Travel 83 11.4%
… Travel_Frequently 123 16.8%
… Travel_Rarely 524 71.8%
DailyRate 730 821.16 401.414 111 483.75 1178.25 1499
Department 730
… Human Resources 29 4%
… Research & Development 487 66.7%
… Sales 214 29.3%
DistanceFromHome 730 9.029 7.983 1 2 13 29
Education 730
… 1 80 11%
… 2 150 20.5%
… 3 269 36.8%
… 4 208 28.5%
… 5 23 3.2%
EducationField 730
… Human Resources 11 1.5%
… Life Sciences 305 41.8%
… Marketing 80 11%
… Medical 233 31.9%
… Other 43 5.9%
… Technical Degree 58 7.9%
EmployeeCount 730 1 0 1 1 1 1
EmployeeNumber 730 1035.866 606.517 11 476.25 1571.5 2064
EnvironmentSatisfaction 730
… 1 130 17.8%
… 2 154 21.1%
… 3 223 30.5%
… 4 223 30.5%
Gender 730
… Female 301 41.2%
… Male 429 58.8%
HourlyRate 730 65.292 20.203 30 48 82.75 100
JobInvolvement 730
… 1 25 3.4%
… 2 184 25.2%
… 3 447 61.2%
… 4 74 10.1%
JobLevel 730
… 1 243 33.3%
… 2 282 38.6%
… 3 115 15.8%
… 4 57 7.8%
… 5 33 4.5%
JobRole 730
… Healthcare Representative 68 9.3%
… Human Resources 21 2.9%
… Laboratory Technician 123 16.8%
… Manager 47 6.4%
… Manufacturing Director 85 11.6%
… Research Director 50 6.8%
… Research Scientist 140 19.2%
… Sales Executive 167 22.9%
… Sales Representative 29 4%
JobSatisfaction 730
… 1 141 19.3%
… 2 135 18.5%
… 3 211 28.9%
… 4 243 33.3%
MaritalStatus 730
… Divorced 179 24.5%
… Married 352 48.2%
… Single 199 27.3%
MonthlyIncome 730 6702 4675.472 1129 3162 8736.5 19999
MonthlyRate 730 14460.123 7126.983 2094 8191.25 20644.75 26997
NumCompaniesWorked 730 2.66 2.466 0 1 4 9
Over18 730
… Y 730 100%
OverTime 730
… No 558 76.4%
… Yes 172 23.6%
PercentSalaryHike 730 15.175 3.627 11 12 18 25
PerformanceRating 730
… 3 621 85.1%
… 4 109 14.9%
RelationshipSatisfaction 730
… 1 139 19%
… 2 144 19.7%
… 3 225 30.8%
… 4 222 30.4%
StandardHours 730 80 0 80 80 80 80
StockOptionLevel 730
… 0 281 38.5%
… 1 328 44.9%
… 2 78 10.7%
… 3 43 5.9%
TotalWorkingYears 730 11.603 7.459 0 6 15 37
TrainingTimesLastYear 730
… 0 22 3%
… 1 34 4.7%
… 2 252 34.5%
… 3 265 36.3%
… 4 57 7.8%
… 5 68 9.3%
… 6 32 4.4%
WorkLifeBalance 730
… 1 31 4.2%
… 2 162 22.2%
… 3 452 61.9%
… 4 85 11.6%
YearsAtCompany 730 7.301 5.936 0 3 10 33
YearsInCurrentRole 730 4.453 3.645 0 2 7 18
YearsSinceLastPromotion 730 2.175 3.147 0 0 3 15
YearsWithCurrManager 730 4.37 3.591 0 2 7 17
Educational_Levels 730
… Bachelors D. 269 36.8%
… College D. 150 20.5%
… Masters D. 208 28.5%
… Phd D. 23 3.2%
… Without College D. 80 11%
Attrition: Yes
ID 140 462.607 250.266 28 259.25 687.5 863
Age 140 33.786 9.615 18 28 39 58
BusinessTravel 140
… Non-Travel 11 7.9%
… Travel_Frequently 35 25%
… Travel_Rarely 94 67.1%
DailyRate 140 784.293 399.564 103 428.75 1110.75 1496
Department 140
… Human Resources 6 4.3%
… Research & Development 75 53.6%
… Sales 59 42.1%
DistanceFromHome 140 10.957 8.749 1 3 19 29
Education 140
… 1 18 12.9%
… 2 32 22.9%
… 3 55 39.3%
… 4 32 22.9%
… 5 3 2.1%
EducationField 140
… Human Resources 4 2.9%
… Life Sciences 53 37.9%
… Marketing 20 14.3%
… Medical 37 26.4%
… Other 9 6.4%
… Technical Degree 17 12.1%
EmployeeCount 140 1 0 1 1 1 1
EmployeeNumber 140 998.371 596.858 1 483.25 1508.5 2027
EnvironmentSatisfaction 140
… 1 42 30%
… 2 24 17.1%
… 3 35 25%
… 4 39 27.9%
Gender 140
… Female 53 37.9%
… Male 87 62.1%
HourlyRate 140 67.293 19.712 32 51 84 100
JobInvolvement 140
… 1 22 15.7%
… 2 44 31.4%
… 3 67 47.9%
… 4 7 5%
JobLevel 140
… 1 86 61.4%
… 2 30 21.4%
… 3 17 12.1%
… 4 3 2.1%
… 5 4 2.9%
JobRole 140
… Healthcare Representative 8 5.7%
… Human Resources 6 4.3%
… Laboratory Technician 30 21.4%
… Manager 4 2.9%
… Manufacturing Director 2 1.4%
… Research Director 1 0.7%
… Research Scientist 32 22.9%
… Sales Executive 33 23.6%
… Sales Representative 24 17.1%
JobSatisfaction 140
… 1 38 27.1%
… 2 31 22.1%
… 3 43 30.7%
… 4 28 20%
MaritalStatus 140
… Divorced 12 8.6%
… Married 58 41.4%
… Single 70 50%
MonthlyIncome 140 4764.786 3786.389 1081 2341.5 5838.75 19859
MonthlyRate 140 13624.286 6993.816 2396 8054.25 19498 26959
NumCompaniesWorked 140 3.079 2.772 0 1 5 9
Over18 140
… Y 140 100%
OverTime 140
… No 60 42.9%
… Yes 80 57.1%
PercentSalaryHike 140 15.329 3.928 11 12 18 25
PerformanceRating 140
… 3 117 83.6%
… 4 23 16.4%
RelationshipSatisfaction 140
… 1 35 25%
… 2 27 19.3%
… 3 36 25.7%
… 4 42 30%
StandardHours 140 80 0 80 80 80 80
StockOptionLevel 140
… 0 98 70%
… 1 27 19.3%
… 2 3 2.1%
… 3 12 8.6%
TotalWorkingYears 140 8.186 7.162 0 3 10 40
TrainingTimesLastYear 140
… 0 8 5.7%
… 1 5 3.6%
… 2 57 40.7%
… 3 43 30.7%
… 4 16 11.4%
… 5 7 5%
… 6 4 2.9%
WorkLifeBalance 140
… 1 17 12.1%
… 2 30 21.4%
… 3 80 57.1%
… 4 13 9.3%
YearsAtCompany 140 5.193 6.171 0 1 8 40
YearsInCurrentRole 140 2.907 3.333 0 0 4 15
YearsSinceLastPromotion 140 2.136 3.395 0 0 2 15
YearsWithCurrManager 140 2.943 3.245 0 0 6 14
Educational_Levels 140
… Bachelors D. 55 39.3%
… College D. 32 22.9%
… Masters D. 32 22.9%
… Phd D. 3 2.1%
… Without College D. 18 12.9%

Let's get a better understanding of how the numeric features relate to each other and to Attrition through pairs (correlation) plots

# Colours used by the pairs plots: cols[1] for points and correlation text,
# cols[2] for the fill of the diagonal bar charts.
cols <- c("#4c86ad", "#f5dfb3")

# Build a GGally pairs plot for the columns of `data` named in `vars`.
#   lower triangle: scatter points (numeric pairs), boxplots (mixed pairs)
#   upper triangle: correlation (numeric pairs), faceted densities (mixed)
#   diagonal:       bar charts (18 bins for numeric columns)
# Returns the ggpairs object; it is drawn when printed.
plot_pairs <- function(data, vars, palette = cols) {
  GGally::ggpairs(
    data[vars],
    lower = list(
      continuous = GGally::wrap("points", col = palette[1], alpha = 0.6),
      combo = GGally::wrap("box", fill = "white", col = "black")
    ),
    upper = list(
      continuous = GGally::wrap("cor", col = palette[1]),
      combo = GGally::wrap("facetdensity", col = "black")
    ),
    diag = list(
      continuous = GGally::wrap(
        "barDiag", fill = palette[2], col = "black", bins = 18
      ),
      discrete = GGally::wrap("barDiag", fill = palette[2], col = "black")
    )
  )
}

# Attrition against income and the tenure-related variables.
plot_pairs(
  df,
  c(
    "Attrition", "MonthlyIncome", "YearsSinceLastPromotion",
    "YearsWithCurrManager", "YearsAtCompany", "YearsInCurrentRole",
    "TotalWorkingYears"
  )
)

# Attrition against the pay-related variables and commuting distance.
plot_pairs(
  df,
  c(
    "Attrition", "PercentSalaryHike", "MonthlyIncome", "HourlyRate",
    "MonthlyRate", "DistanceFromHome"
  )
)

# EDA of bivariate data

# Monthly income by gender: horizontal boxplots, one per gender, with
# manually chosen fill and outline colours.
Income_by_Gender <- ggplot(
  data = df,
  mapping = aes(x = Gender, y = MonthlyIncome, colour = Gender, fill = Gender)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("#F5A9F2", "#5882FA")) +
  scale_colour_manual(values = c("#FE2EF7", "#5858FA")) +
  coord_flip() +
  ggtitle("Are there any Gender Disparities in Income?")
print(Income_by_Gender)

Plot the relationship between the categorical variables and Attrition:

# ------- More graphs to explore the data -----

# Dodged bar chart of row counts per level of `x_var`, split by `fill_var`.
# Both arguments are column names of `data` given as strings. The bar height
# is geom_bar()'s default count statistic, so the deprecated `..count..`
# notation is not needed. Returns the ggplot object.
plot_count_bars <- function(x_var, fill_var = "Attrition", data = df) {
  ggplot(data, aes(x = .data[[x_var]])) +
    geom_bar(aes(fill = .data[[fill_var]]), position = "dodge") +
    labs(x = x_var, y = "count", fill = fill_var)
}

# Variables whose distribution is compared between the Attrition groups.
# Each variable is listed (and therefore plotted) exactly once.
attrition_bar_vars <- c(
  "OverTime", "JobInvolvement", "JobSatisfaction", "StockOptionLevel",
  "NumCompaniesWorked", "WorkLifeBalance", "TrainingTimesLastYear",
  "PerformanceRating", "RelationshipSatisfaction",
  "EnvironmentSatisfaction", "Education", "BusinessTravel", "JobLevel"
)

# print() is required because plots are not auto-printed inside a loop.
for (bar_var in attrition_bar_vars) {
  print(plot_count_bars(bar_var))
}

# Distribution of job satisfaction within each job role.
plot_count_bars("JobSatisfaction", fill_var = "JobRole")

# ———— Continuous variables ————

# Boxplots of numeric variables, mostly comparing employees with and
# without attrition. Each entry is c(<x column>, <y column>); the plots are
# drawn in the order listed.
boxplot_pairs <- list(
  c("Attrition", "PercentSalaryHike"),
  c("Attrition", "MonthlyIncome"),
  c("JobRole", "MonthlyIncome"),
  c("Attrition", "HourlyRate"),
  c("JobSatisfaction", "HourlyRate"),
  c("JobSatisfaction", "MonthlyIncome"),
  c("Attrition", "DistanceFromHome"),
  c("JobSatisfaction", "DistanceFromHome"),
  c("Attrition", "YearsSinceLastPromotion"),
  c("Attrition", "YearsWithCurrManager"),
  c("Attrition", "YearsAtCompany"),
  c("Attrition", "YearsInCurrentRole"),
  c("Attrition", "TotalWorkingYears")
)

# print() is required because plots are not auto-printed inside a loop.
for (box_pair in boxplot_pairs) {
  print(
    ggplot(df, aes(x = .data[[box_pair[1]]], y = .data[[box_pair[2]]])) +
      geom_boxplot(fill = "green") +
      labs(x = box_pair[1], y = box_pair[2])
  )
}

# Scatterplots of numeric variables, coloured by Attrition, each with one
# fitted trend line per Attrition group and no confidence band.
# `se = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned.

# Linear trend of income over total working years.
ggplot(df, aes(TotalWorkingYears, MonthlyIncome, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = lm, se = FALSE)

# Loess trend of income over years in the current role.
ggplot(df, aes(YearsInCurrentRole, MonthlyIncome, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = loess, se = FALSE)

# Loess trend of income over years at the company.
ggplot(df, aes(YearsAtCompany, MonthlyIncome, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = loess, se = FALSE)

# Loess trend of years in current role over years since last promotion
# (drawn once).
ggplot(df, aes(YearsSinceLastPromotion, YearsInCurrentRole, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = loess, se = FALSE)

# Loess trend of years since last promotion over total working years.
ggplot(df, aes(TotalWorkingYears, YearsSinceLastPromotion, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = loess, se = FALSE)

Evaluation of monthly income using a histogram

### 1. Monthly Income Variable
# Histogram of monthly income on the density scale with a kernel density
# curve on top. after_stat() is the current way to refer to a computed
# statistic; it replaces the deprecated stat().
ggplot(df, aes(x = MonthlyIncome)) +
  geom_histogram(
    aes(y = after_stat(density)),
    col = "blue",
    fill = "gold"
  ) +
  geom_density(col = "red", size = 1) +
  labs(
    x = "Monthly Income (Salary)",
    y = " ",
    title = "Histogram for Monthly Income"
  ) +
  theme_bw()

range(df$MonthlyIncome)
## [1]  1081 19999
# Monthly income is not normally distributed; it is positively (right) skewed.

2. Attrition variable

# Bar chart of the share of employees in each Attrition group, with the
# percentage printed on each bar. Both the bar height and the label are
# derived from the computed count, so they are wrapped in after_stat()
# (the replacement for the deprecated stat()).
ggplot(
  df,
  aes(
    x = Attrition,
    y = after_stat(prop.table(count)),
    label = after_stat(scales::percent(prop.table(count)))
  )
) +
  geom_bar(col = "blue", fill = "lightblue") +
  geom_text(stat = "count", size = 5) +
  labs(y = "Frequency", title = "Barplot for Attrition") +
  theme_bw()

# In this data set, 16 percent of the employees belong to the Attrition "Yes" group.

3. Monthly Income and Attrition

# Boxplots of monthly income, one box per Attrition group.
ggplot(data = df, mapping = aes(x = Attrition, y = MonthlyIncome)) +
  geom_boxplot(colour = "blue", fill = "gold") +
  xlab("Attrition") +
  ylab("Monthly Income (Salary)") +
  ggtitle("Boxplots of Monthly Income for Attrition groups") +
  theme_bw()

# Judging by the medians, employees with a lower salary tend to leave their current job.

4. Monthly Income and Age with Attrition

# Monthly income against age, one panel per Attrition group, with a linear
# fit and no confidence band. `se = FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
ggplot(df, aes(x = Age, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    col = "red",
    size = 1
  ) +
  labs(
    y = "Monthly Income (Salary)",
    x = "Age",
    title = "Scatter plot for Monthly Income vs Age for Attrition"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

# Correlation between age and monthly income over all employees.
cor(df$Age, df$MonthlyIncome)
## [1] 0.4842883
# There is a positive relationship between monthly income and age in both groups: as age increases, monthly income also increases.

5. Monthly Income and Business Travel groups | Attrition and Business Travel groups

# Share of each Attrition group within every business-travel category
# (bars are stacked to 100 %).
ggplot(df, aes(x = BusinessTravel, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Business Travel",
    y = "Attrition",
    title = "Business Travel with Attrition"
  ) +
  theme_bw()

# Distribution of monthly income per business-travel category.
# The axis label and title now read "Monthly" (previously misspelled).
ggplot(df, aes(x = BusinessTravel, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Business Travel",
    y = "Monthly Income",
    title = "Monthly Income for Business Travel groups"
  ) +
  theme_bw()

# Employees who travel frequently have the highest attrition rate, and the Non-Travel group has the lowest income.

6. Hourly Rate, Daily Rate and Monthly Rate with Monthly Income and Attrition

# Scatter plot of monthly income against one pay-rate column of df, with a
# linear fit (no confidence band) and one panel per Attrition group.
#   rate_var   - name of the rate column, as a string
#   rate_label - text used for the x axis and in the plot title
# Returns the ggplot object.
plot_income_vs_rate <- function(rate_var, rate_label) {
  ggplot(df, aes(x = .data[[rate_var]], y = MonthlyIncome)) +
    geom_point(col = "blue") +
    geom_smooth(
      method = "lm",
      formula = y ~ x,
      se = FALSE, # FALSE rather than F, which is a reassignable variable
      col = "red",
      size = 1
    ) +
    labs(
      x = rate_label,
      y = "Monthly Income",
      title = paste("Monthly Income vs", rate_label)
    ) +
    theme_bw() +
    facet_wrap(~ Attrition)
}

p1 <- plot_income_vs_rate("HourlyRate", "Hourly Rate")
p2 <- plot_income_vs_rate("DailyRate", "Daily Rate")
p3 <- plot_income_vs_rate("MonthlyRate", "Monthly Rate")

# Stack the three plots in a single figure.
grid.arrange(p1, p2, p3)

# Pairwise correlations (cor() defaults) between the pay and tenure
# variables for employees with Attrition == "Yes", rounded to 2 decimals.
Attrition.Yes[c(
  "HourlyRate", "DailyRate", "MonthlyRate", "MonthlyIncome",
  "DistanceFromHome", "TotalWorkingYears", "YearsInCurrentRole",
  "YearsAtCompany", "YearsWithCurrManager", "YearsSinceLastPromotion"
)] %>%
  cor() %>%
  round(digits = 2)
##                         HourlyRate DailyRate MonthlyRate MonthlyIncome
## HourlyRate                    1.00      0.19       -0.01          0.07
## DailyRate                     0.19      1.00       -0.03          0.13
## MonthlyRate                  -0.01     -0.03        1.00          0.13
## MonthlyIncome                 0.07      0.13        0.13          1.00
## DistanceFromHome              0.03     -0.08       -0.05         -0.01
## TotalWorkingYears             0.08      0.09       -0.02          0.74
## YearsInCurrentRole           -0.04     -0.05       -0.05          0.58
## YearsAtCompany                0.01     -0.06       -0.06          0.65
## YearsWithCurrManager         -0.03      0.01       -0.10          0.50
## YearsSinceLastPromotion      -0.04     -0.07       -0.04          0.47
##                         DistanceFromHome TotalWorkingYears YearsInCurrentRole
## HourlyRate                          0.03              0.08              -0.04
## DailyRate                          -0.08              0.09              -0.05
## MonthlyRate                        -0.05             -0.02              -0.05
## MonthlyIncome                      -0.01              0.74               0.58
## DistanceFromHome                    1.00              0.01              -0.01
## TotalWorkingYears                   0.01              1.00               0.64
## YearsInCurrentRole                 -0.01              0.64               1.00
## YearsAtCompany                      0.01              0.78               0.82
## YearsWithCurrManager               -0.01              0.62               0.81
## YearsSinceLastPromotion             0.04              0.59               0.67
##                         YearsAtCompany YearsWithCurrManager
## HourlyRate                        0.01                -0.03
## DailyRate                        -0.06                 0.01
## MonthlyRate                      -0.06                -0.10
## MonthlyIncome                     0.65                 0.50
## DistanceFromHome                  0.01                -0.01
## TotalWorkingYears                 0.78                 0.62
## YearsInCurrentRole                0.82                 0.81
## YearsAtCompany                    1.00                 0.77
## YearsWithCurrManager              0.77                 1.00
## YearsSinceLastPromotion           0.75                 0.69
##                         YearsSinceLastPromotion
## HourlyRate                                -0.04
## DailyRate                                 -0.07
## MonthlyRate                               -0.04
## MonthlyIncome                              0.47
## DistanceFromHome                           0.04
## TotalWorkingYears                          0.59
## YearsInCurrentRole                         0.67
## YearsAtCompany                             0.75
## YearsWithCurrManager                       0.69
## YearsSinceLastPromotion                    1.00
# Pairwise correlations (cor() defaults) between the pay and tenure
# variables for employees with Attrition == "No", rounded to 2 decimals.
Attrition.No[c(
  "HourlyRate", "DailyRate", "MonthlyRate", "MonthlyIncome",
  "DistanceFromHome", "TotalWorkingYears", "YearsInCurrentRole",
  "YearsAtCompany", "YearsWithCurrManager", "YearsSinceLastPromotion"
)] %>%
  cor() %>%
  round(digits = 2)
##                         HourlyRate DailyRate MonthlyRate MonthlyIncome
## HourlyRate                    1.00      0.02       -0.02          0.00
## DailyRate                     0.02      1.00       -0.03         -0.03
## MonthlyRate                  -0.02     -0.03        1.00          0.05
## MonthlyIncome                 0.00     -0.03        0.05          1.00
## DistanceFromHome              0.07      0.04        0.01          0.01
## TotalWorkingYears             0.03     -0.03        0.06          0.78
## YearsInCurrentRole            0.01      0.00        0.03          0.31
## YearsAtCompany                0.00     -0.04       -0.02          0.46
## YearsWithCurrManager          0.01     -0.04       -0.02          0.29
## YearsSinceLastPromotion       0.02     -0.06        0.02          0.30
##                         DistanceFromHome TotalWorkingYears YearsInCurrentRole
## HourlyRate                          0.07              0.03               0.01
## DailyRate                           0.04             -0.03               0.00
## MonthlyRate                         0.01              0.06               0.03
## MonthlyIncome                       0.01              0.78               0.31
## DistanceFromHome                    1.00              0.02               0.01
## TotalWorkingYears                   0.02              1.00               0.45
## YearsInCurrentRole                  0.01              0.45               1.00
## YearsAtCompany                     -0.01              0.60               0.76
## YearsWithCurrManager                0.00              0.42               0.69
## YearsSinceLastPromotion            -0.03              0.43               0.54
##                         YearsAtCompany YearsWithCurrManager
## HourlyRate                        0.00                 0.01
## DailyRate                        -0.04                -0.04
## MonthlyRate                      -0.02                -0.02
## MonthlyIncome                     0.46                 0.29
## DistanceFromHome                 -0.01                 0.00
## TotalWorkingYears                 0.60                 0.42
## YearsInCurrentRole                0.76                 0.69
## YearsAtCompany                    1.00                 0.76
## YearsWithCurrManager              0.76                 1.00
## YearsSinceLastPromotion           0.63                 0.48
##                         YearsSinceLastPromotion
## HourlyRate                                 0.02
## DailyRate                                 -0.06
## MonthlyRate                                0.02
## MonthlyIncome                              0.30
## DistanceFromHome                          -0.03
## TotalWorkingYears                          0.43
## YearsInCurrentRole                         0.54
## YearsAtCompany                             0.63
## YearsWithCurrManager                       0.48
## YearsSinceLastPromotion                    1.00
# There is no linear relationship between the following pairs of variables in either Attrition group.
# 1. Hourly Rate and Monthly Income
# 2. Daily Rate and Monthly Income
# 3. Monthly Rate and Monthly Income

7. Monthly Income and Department | Attrition and Department

# Proportion of each Attrition level within every Department
# (position = "fill" rescales each bar to 100%).
ggplot(df, aes(x = Department, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Department", y = "Attrition",
    title = "Department with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per Department.
ggplot(df, aes(x = Department, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Department", y = "Monthly Income",
    title = "Monthly Income for Departments"
  ) +
  theme_bw()

# The Sales department has a higher attrition rate, and the HR department has the lowest median monthly income.

8. Distance from home with Monthly Income and Attrition

# MonthlyIncome against DistanceFromHome with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = DistanceFromHome, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Distance From Home", y = "Monthly Income",
    title = "Monthly Income vs Distance From Home"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$DistanceFromHome, Attrition.Yes$MonthlyIncome)
## [1] -0.01230388
cor(Attrition.No$DistanceFromHome, Attrition.No$MonthlyIncome)
## [1] 0.0102446

There is no relationship between Distance From Home and Monthly Income in either Attrition group (both correlations are close to 0).

9. Monthly Income and Education | Attrition and Education

# Treat Education as a categorical variable so each level gets its own bar/box.
df$Education <- as.factor(df$Education)

# Proportion of each Attrition level within every Education level.
ggplot(df, aes(x = Education, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Education", y = "Attrition",
    title = "Education with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per Education level.
ggplot(df, aes(x = Education, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Education", y = "Monthly Income",
    title = "Monthly Income for Education"
  ) +
  theme_bw()

# Education level 1 has the highest attrition rate.
# Education level 5 has the highest median income.

10. Monthly Income and Education Field | Attrition and Education Field

# Proportion of each Attrition level within every EducationField.
ggplot(df, aes(x = EducationField, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Education Field", y = "Attrition",
    title = "Education Field with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per EducationField.
ggplot(df, aes(x = EducationField, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Education Field", y = "Monthly Income",
    title = "Monthly Income for Education Field"
  ) +
  theme_bw()

# The Human Resources education field has the highest attrition rate.
# The Marketing education field has the highest median income.

11. Monthly Income and Environment Satisfaction | Attrition and Environment Satisfaction

# Treat EnvironmentSatisfaction as a categorical variable.
df$EnvironmentSatisfaction <- as.factor(df$EnvironmentSatisfaction)

# Proportion of each Attrition level within every EnvironmentSatisfaction level.
ggplot(df, aes(x = EnvironmentSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Environment Satisfaction", y = "Attrition",
    title = "Environment Satisfaction with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per EnvironmentSatisfaction level.
ggplot(df, aes(x = EnvironmentSatisfaction, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Environment Satisfaction", y = "Monthly Income",
    title = "Monthly Income for Environment Satisfaction"
  ) +
  theme_bw()

# Employees who are less satisfied with their environment have higher attrition rates.
# Median incomes are very similar across all satisfaction levels.

12. Monthly Income and Gender | Attrition and Gender

# Proportion of each Attrition level within every Gender.
ggplot(df, aes(x = Gender, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Gender", y = "Attrition",
    title = "Gender with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per Gender.
ggplot(df, aes(x = Gender, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Gender", y = "Monthly Income",
    title = "Monthly Income for Gender"
  ) +
  theme_bw()

# The male and female groups have similar attrition rates.

13. Monthly Income and Job Involvement | Attrition and Job Involvement

# Treat JobInvolvement as a categorical variable.
df$JobInvolvement <- as.factor(df$JobInvolvement)

# Proportion of each Attrition level within every JobInvolvement level.
ggplot(df, aes(x = JobInvolvement, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Job Involvement", y = "Attrition",
    title = "Job Involvement with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per JobInvolvement level.
ggplot(df, aes(x = JobInvolvement, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Job Involvement", y = "Monthly Income",
    title = "Monthly Income for Job Involvement"
  ) +
  theme_bw()

# Employees with lower job involvement have a much higher attrition rate.

14. Monthly Income and Job Level | Attrition and Job Level

# Treat JobLevel as a categorical variable.
df$JobLevel <- as.factor(df$JobLevel)

# Proportion of each Attrition level within every JobLevel.
ggplot(df, aes(x = JobLevel, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Job Level", y = "Attrition",
    title = "Job Level with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per JobLevel.
ggplot(df, aes(x = JobLevel, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Job Level", y = "Monthly Income",
    title = "Monthly Income for Job Level"
  ) +
  theme_bw()

# Employees who are in job level 1 have highest attrition rate.
# When job level increases, monthly income also increases.

15. Monthly Income and Job Role | Attrition and Job Role

# Proportion of each Attrition level within every JobRole.
# The x-axis labels are rotated 45 degrees.
ggplot(df, aes(x = JobRole, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Job Role", y = "Attrition",
    title = "Job Role with Attrition"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Distribution of MonthlyIncome per JobRole.
ggplot(df, aes(x = JobRole, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Job Role", y = "Monthly Income",
    title = "Monthly Income for Job Role"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Sales representatives have higher attrition rate and lowest income.
# Managers and Research Directors have the highest monthly income and a lower attrition rate.

16. Monthly Income and Job Satisfaction | Attrition and Job Satisfaction

# Treat JobSatisfaction as a categorical variable.
df$JobSatisfaction <- as.factor(df$JobSatisfaction)

# Proportion of each Attrition level within every JobSatisfaction level.
ggplot(df, aes(x = JobSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Job Satisfaction", y = "Attrition",
    title = "Job Satisfaction with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per JobSatisfaction level.
ggplot(df, aes(x = JobSatisfaction, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Job Satisfaction", y = "Monthly Income",
    title = "Monthly Income for Job Satisfaction"
  ) +
  theme_bw()

# Employees who are less satisfied with their jobs have higher attrition rates.

17. Monthly Income and Marital Status | Attrition and Marital Status

# Proportion of each Attrition level within every MaritalStatus.
ggplot(df, aes(x = MaritalStatus, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Marital Status", y = "Attrition",
    title = "Marital Status with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per MaritalStatus.
ggplot(df, aes(x = MaritalStatus, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Marital Status", y = "Monthly Income",
    title = "Monthly Income for Marital Status"
  ) +
  theme_bw()

# Single employees have highest attrition rates.

18. Monthly Income and Over Time | Attrition and Over Time

# Proportion of each Attrition level within every OverTime group.
ggplot(df, aes(x = OverTime, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Over Time", y = "Attrition",
    title = "Over Time with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per OverTime group.
ggplot(df, aes(x = OverTime, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Over Time", y = "Monthly Income",
    title = "Monthly Income for Over Time"
  ) +
  theme_bw()

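# Employees who work overtime have a higher attrition rate. Their monthly income appears lower in the plot, but the Wilcoxon test later in this file shows the difference is not significant (p-value = 0.6161).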

19. Number of Companies Worked with Monthly Income and Attrition

# MonthlyIncome against NumCompaniesWorked with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = NumCompaniesWorked, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Number of Companies Worked", y = "Monthly Income",
    title = "Monthly Income vs Number of Companies Worked"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$NumCompaniesWorked, Attrition.Yes$MonthlyIncome)
## [1] 0.1007262
cor(Attrition.No$NumCompaniesWorked, Attrition.No$MonthlyIncome)
## [1] 0.180646
# There is only a weak positive relationship between monthly income and the number of companies worked.

20. Percent Salary Hike with Monthly Income and Attrition

# MonthlyIncome against PercentSalaryHike with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = PercentSalaryHike, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Percent Salary Hike", y = "Monthly Income",
    title = "Monthly Income vs Percent Salary Hike"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$PercentSalaryHike, Attrition.Yes$MonthlyIncome)
## [1] -0.1406169
cor(Attrition.No$PercentSalaryHike, Attrition.No$MonthlyIncome)
## [1] -0.03764575
# It seems that as Percent Salary Hike increases, monthly income decreases slightly, but the correlation is weak in both groups.

21. Monthly Income and Relationship Satisfaction | Attrition and Relationship Satisfaction

# Treat RelationshipSatisfaction as a categorical variable.
df$RelationshipSatisfaction <- as.factor(df$RelationshipSatisfaction)

# Proportion of each Attrition level within every RelationshipSatisfaction level.
ggplot(df, aes(x = RelationshipSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Relationship Satisfaction", y = "Attrition",
    title = "Relationship Satisfaction with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per RelationshipSatisfaction level.
ggplot(df, aes(x = RelationshipSatisfaction, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Relationship Satisfaction", y = "Monthly Income",
    title = "Monthly Income for Relationship Satisfaction"
  ) +
  theme_bw()

# Employees who have low relationship satisfaction, have a higher attrition rate. But median incomes are similar in all levels.

22. Monthly Income and Stock Option Level | Attrition and Stock Option Level

# Treat StockOptionLevel as a categorical variable.
df$StockOptionLevel <- as.factor(df$StockOptionLevel)

# Proportion of each Attrition level within every StockOptionLevel.
ggplot(df, aes(x = StockOptionLevel, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Stock Option Level", y = "Attrition",
    title = "Stock Option Level with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per StockOptionLevel.
ggplot(df, aes(x = StockOptionLevel, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Stock Option Level", y = "Monthly Income",
    title = "Monthly Income for Stock Option Level"
  ) +
  theme_bw()

# Stock option levels 0 and 3 have higher attrition rates and lower median incomes.

23. Total Working Years with Monthly Income and Attrition

# MonthlyIncome against TotalWorkingYears with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = TotalWorkingYears, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Total Working Years", y = "Monthly Income",
    title = "Monthly Income vs Total Working Years"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$TotalWorkingYears, Attrition.Yes$MonthlyIncome)
## [1] 0.7360898
cor(Attrition.No$TotalWorkingYears, Attrition.No$MonthlyIncome)
## [1] 0.7795562
# There is strong positive relationship between Total Working Years and Monthly Income both attrition groups.

24. Training Times Last Year with Monthly Income and Attrition

# MonthlyIncome against TrainingTimesLastYear with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = TrainingTimesLastYear, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Training Times Last Year", y = "Monthly Income",
    title = "Monthly Income vs Training Times Last Year"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

25. Monthly Income and Work Life Balance | Attrition and Work Life Balance

# Treat WorkLifeBalance as a categorical variable.
df$WorkLifeBalance <- as.factor(df$WorkLifeBalance)

# Proportion of each Attrition level within every WorkLifeBalance level.
ggplot(df, aes(x = WorkLifeBalance, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Work Life Balance", y = "Attrition",
    title = "Work Life Balance with Attrition"
  ) +
  theme_bw()

# Distribution of MonthlyIncome per WorkLifeBalance level.
ggplot(df, aes(x = WorkLifeBalance, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(
    x = "Work Life Balance", y = "Monthly Income",
    title = "Monthly Income for Work Life Balance"
  ) +
  theme_bw()

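# Employees who have poor work-life balance have a higher attrition rate. Their median income looks lowest in the plot, but the Kruskal-Wallis test later in this file finds no significant income difference across work-life balance levels (p-value = 0.7682).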

26. Years At Company with Monthly Income and Attrition

# MonthlyIncome against YearsAtCompany with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = YearsAtCompany, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Years At Company", y = "Monthly Income",
    title = "Monthly Income vs Years At Company"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$YearsAtCompany, Attrition.Yes$MonthlyIncome)
## [1] 0.6450931
cor(Attrition.No$YearsAtCompany, Attrition.No$MonthlyIncome)
## [1] 0.456972
# There is a moderate positive relationship between Years At Company and Monthly Income in both attrition groups.

27. Years In Current Role with Monthly Income and Attrition

# MonthlyIncome against YearsInCurrentRole with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = YearsInCurrentRole, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Years In Current Role", y = "Monthly Income",
    title = "Monthly Income vs Years In Current Role"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$YearsInCurrentRole, Attrition.Yes$MonthlyIncome)
## [1] 0.5768375
cor(Attrition.No$YearsInCurrentRole, Attrition.No$MonthlyIncome)
## [1] 0.3137409
# There is positive relationship between Years In Current Role and Monthly Income both attrition groups.
# Employees who work more than 15 years in current role, do not leave the company.

28. Years Since Last Promotion with Monthly Income and Attrition

# MonthlyIncome against YearsSinceLastPromotion with a fitted linear trend
# line (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = YearsSinceLastPromotion, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Years Since Last Promotion", y = "Monthly Income",
    title = "Monthly Income vs Years Since Last Promotion"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$YearsSinceLastPromotion, Attrition.Yes$MonthlyIncome)
## [1] 0.4735753
cor(Attrition.No$YearsSinceLastPromotion, Attrition.No$MonthlyIncome)
## [1] 0.2951971
# There is positive relationship between Years Since Last Promotion and Monthly Income both attrition groups.

29. Years With Current Manager with Monthly Income and Attrition

# MonthlyIncome against YearsWithCurrManager with a fitted linear trend line
# (no confidence band), one panel per Attrition level.
ggplot(df, aes(x = YearsWithCurrManager, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "red", size = 1
  ) +
  labs(
    x = "Years With Current Manager", y = "Monthly Income",
    title = "Monthly Income vs Years With Current Manager"
  ) +
  theme_bw() +
  facet_wrap(~ Attrition)

cor(Attrition.Yes$YearsWithCurrManager, Attrition.Yes$MonthlyIncome)
## [1] 0.4959712
cor(Attrition.No$YearsWithCurrManager, Attrition.No$MonthlyIncome)
## [1] 0.2875556

#———evaluate the normality of variables———-

shapiro.test(df$MonthlyRate)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$MonthlyRate
## W = 0.9549, p-value = 1.083e-15
shapiro.test(df$PercentSalaryHike)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$PercentSalaryHike
## W = 0.89909, p-value < 2.2e-16
shapiro.test(df$MonthlyIncome)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$MonthlyIncome
## W = 0.83195, p-value < 2.2e-16
shapiro.test(df$HourlyRate)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$HourlyRate
## W = 0.95517, p-value = 1.221e-15
shapiro.test(df$YearsSinceLastPromotion)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$YearsSinceLastPromotion
## W = 0.70474, p-value < 2.2e-16
shapiro.test(df$YearsWithCurrManager)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$YearsWithCurrManager
## W = 0.89891, p-value < 2.2e-16
shapiro.test(df$YearsAtCompany)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$YearsAtCompany
## W = 0.85504, p-value < 2.2e-16
shapiro.test(df$YearsInCurrentRole)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$YearsInCurrentRole
## W = 0.89509, p-value < 2.2e-16
shapiro.test(df$TotalWorkingYears)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$TotalWorkingYears
## W = 0.90948, p-value < 2.2e-16
shapiro.test(df$NumCompaniesWorked)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$NumCompaniesWorked
## W = 0.84746, p-value < 2.2e-16

#———————————- # Mann-Whitney U test to evaluate the difference in numeric variables between employees with and without Attrition:

wilcox.test(NumCompaniesWorked ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  NumCompaniesWorked by Attrition
## W = 47486, p-value = 0.1723
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(MonthlyIncome ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  MonthlyIncome by Attrition
## W = 67118, p-value = 4.074e-09
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(PercentSalaryHike ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  PercentSalaryHike by Attrition
## W = 51018, p-value = 0.976
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(HourlyRate ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HourlyRate by Attrition
## W = 48218, p-value = 0.2901
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(DistanceFromHome ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  DistanceFromHome by Attrition
## W = 45107, p-value = 0.02725
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsSinceLastPromotion ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsSinceLastPromotion by Attrition
## W = 53456, p-value = 0.3681
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsWithCurrManager ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsWithCurrManager by Attrition
## W = 64295, p-value = 9.347e-07
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsAtCompany ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsAtCompany by Attrition
## W = 66124, p-value = 3.11e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsInCurrentRole ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsInCurrentRole by Attrition
## W = 65436, p-value = 9.482e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(TotalWorkingYears ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  TotalWorkingYears by Attrition
## W = 67078, p-value = 4.042e-09
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(MonthlyRate ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  MonthlyRate by Attrition
## W = 54526, p-value = 0.2086
## alternative hypothesis: true location shift is not equal to 0
# MonthlyIncome (p-value = 4.074e-09), DistanceFromHome (p-value = 0.02725),
# YearsWithCurrManager (p-value = 9.347e-07), YearsAtCompany (p-value = 3.11e-08),
# YearsInCurrentRole (p-value = 9.482e-08) and TotalWorkingYears (p-value = 4.042e-09)
# differed significantly between employees with and without Attrition.

Evaluate the relationship between categorical variables and Attrition using the chi-square test

chisq.test(df$JobSatisfaction,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$JobSatisfaction and df$Attrition
## X-squared = 11.109, df = 3, p-value = 0.01115
chisq.test(df$OverTime,df$Attrition)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df$OverTime and df$Attrition
## X-squared = 62.762, df = 1, p-value = 2.333e-15
chisq.test(df$JobInvolvement,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$JobInvolvement and df$Attrition
## X-squared = 41.465, df = 3, p-value = 5.211e-09
chisq.test(df$StockOptionLevel,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$StockOptionLevel and df$Attrition
## X-squared = 56.245, df = 3, p-value = 3.724e-12
chisq.test(df$NumCompaniesWorked,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$NumCompaniesWorked and df$Attrition
## X-squared = 20.19, df = 9, p-value = 0.01678
chisq.test(df$WorkLifeBalance,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$WorkLifeBalance and df$Attrition
## X-squared = 14.325, df = 3, p-value = 0.002495
chisq.test(df$TrainingTimesLastYear,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$TrainingTimesLastYear and df$Attrition
## X-squared = 10.132, df = 6, p-value = 0.1192
chisq.test(df$PerformanceRating,df$Attrition)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df$PerformanceRating and df$Attrition
## X-squared = 0.10478, df = 1, p-value = 0.7462
chisq.test(df$RelationshipSatisfaction,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$RelationshipSatisfaction and df$Attrition
## X-squared = 3.1253, df = 3, p-value = 0.3727
chisq.test(df$EnvironmentSatisfaction,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$EnvironmentSatisfaction and df$Attrition
## X-squared = 11.231, df = 3, p-value = 0.01054
chisq.test(df$JobLevel,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$JobLevel and df$Attrition
## X-squared = 41.533, df = 4, p-value = 2.085e-08
chisq.test(df$JobRole,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$JobRole and df$Attrition
## X-squared = 60.543, df = 8, p-value = 3.647e-10
chisq.test(df$Department,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$Department and df$Attrition
## X-squared = 9.329, df = 2, p-value = 0.009424
chisq.test(df$NumCompaniesWorked,df$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  df$NumCompaniesWorked and df$Attrition
## X-squared = 20.19, df = 9, p-value = 0.01678
# Variables significantly associated with Attrition (p-value < 0.05):
# JobSatisfaction (p-value = 0.01115), OverTime (p-value = 2.333e-15), JobInvolvement (p-value = 5.211e-09),
# StockOptionLevel (p-value = 3.724e-12), WorkLifeBalance (p-value = 0.002495), EnvironmentSatisfaction (p-value = 0.01054),
# JobLevel (p-value = 2.085e-08), Department (p-value = 0.009424), JobRole (p-value = 3.647e-10), NumCompaniesWorked (p-value = 0.01678)

Evaluate the correlations with Monthly Income

# Two-sided Spearman rank correlation test: Age vs MonthlyIncome.
cor.test(df$Age, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$Age and df$MonthlyIncome
## S = 60400498, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4496556
# There is a moderate positive correlation between age and income (Spearman rho = 0.45, p-value < 2.2e-16)
# Two-sided Spearman rank correlation test: PercentSalaryHike vs MonthlyIncome.
cor.test(df$PercentSalaryHike, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$PercentSalaryHike and df$MonthlyIncome
## S = 116154632, p-value = 0.0854
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.05835313
# There is no significant correlation between PercentSalaryHike and income (Spearman rho = -0.058, p-value = 0.0854)
# Two-sided Spearman rank correlation test: HourlyRate vs MonthlyIncome.
cor.test(df$HourlyRate, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$HourlyRate and df$MonthlyIncome
## S = 110777571, p-value = 0.7828
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.00935957
# There is no significant correlation between HourlyRate and income (Spearman rho = -0.009, p-value = 0.7828)
# Two-sided Spearman rank correlation test: DistanceFromHome vs MonthlyIncome.
cor.test(df$DistanceFromHome, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$DistanceFromHome and df$MonthlyIncome
## S = 107844895, p-value = 0.6091
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.01736176
# There is no significant correlation between DistanceFromHome and income (Spearman rho = 0.017, p-value = 0.6091)
# Two-sided Spearman rank correlation test:
# YearsSinceLastPromotion vs MonthlyIncome.
cor.test(df$YearsSinceLastPromotion, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$YearsSinceLastPromotion and df$MonthlyIncome
## S = 80301098, p-value = 8.191e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2683295
# There is a positive correlation between YearsSinceLastPromotion and income (Spearman rho = 0.2683295, p-value = 8.191e-16)
# Two-sided Spearman rank correlation test:
# YearsWithCurrManager vs MonthlyIncome.
cor.test(df$YearsWithCurrManager, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$YearsWithCurrManager and df$MonthlyIncome
## S = 69749771, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3644688
# There is a positive correlation between YearsWithCurrManager and income (Spearman rho = 0.3644688, p-value < 2.2e-16)
# Two-sided Spearman rank correlation test: YearsAtCompany vs MonthlyIncome.
cor.test(df$YearsAtCompany, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$YearsAtCompany and df$MonthlyIncome
## S = 59312414, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4595697
# There is a positive correlation between YearsAtCompany and income (Spearman rho = 0.4595697, p-value < 2.2e-16)
# Two-sided Spearman rank correlation test: YearsInCurrentRole vs MonthlyIncome.
cor.test(df$YearsInCurrentRole, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$YearsInCurrentRole and df$MonthlyIncome
## S = 65683339, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4015205
# There is a positive correlation between YearsInCurrentRole and income (Spearman rho = 0.4015205, p-value < 2.2e-16)
# Two-sided Spearman rank correlation test: TotalWorkingYears vs MonthlyIncome.
cor.test(df$TotalWorkingYears, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$TotalWorkingYears and df$MonthlyIncome
## S = 31181970, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7158827
# There is a strong positive correlation between TotalWorkingYears and income (Spearman rho = 0.7158827, p-value < 2.2e-16)
# Two-sided Spearman rank correlation test: MonthlyRate vs MonthlyIncome.
cor.test(df$MonthlyRate, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$MonthlyRate and df$MonthlyIncome
## S = 100095425, p-value = 0.009429
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.08797174
#-------------------------------

Since the MonthlyIncome variable is not normally distributed, we use the Kruskal-Wallis test and the Mann-Whitney U test for the following analyses.

1. Monthly Income vs Attrition

wilcox.test(MonthlyIncome ~ Attrition,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  MonthlyIncome by Attrition
## W = 67118, p-value = 4.074e-09
## alternative hypothesis: true location shift is not equal to 0
# Monthly income differs significantly between employees with and without Attrition
# (p-value = 4.074e-09)


### 2. Monthly Income vs BusinessTravel
kruskal.test(MonthlyIncome ~ BusinessTravel,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by BusinessTravel
## Kruskal-Wallis chi-squared = 2.2416, df = 2, p-value = 0.326
kruskal.test(MonthlyIncome ~ NumCompaniesWorked,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by NumCompaniesWorked
## Kruskal-Wallis chi-squared = 54.478, df = 9, p-value = 1.531e-08
### 3. Monthly Income vs Department
kruskal.test(MonthlyIncome ~ Department,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by Department
## Kruskal-Wallis chi-squared = 25.546, df = 2, p-value = 2.836e-06
#Monthly income has a significant relationship with Department(p-value = 2.836e-06)


### 4. Monthly Income vs EducationField
kruskal.test(MonthlyIncome ~ EducationField,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by EducationField
## Kruskal-Wallis chi-squared = 15.544, df = 5, p-value = 0.008274
#Monthly income has a significant relationship with EducationField(p-value = 0.008274)


### 5. Monthly Income vs Gender
wilcox.test(MonthlyIncome ~ Gender,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  MonthlyIncome by Gender
## W = 98590, p-value = 0.04623
## alternative hypothesis: true location shift is not equal to 0
#p-value = 0.04623


### 6. Monthly Income vs JobRole
# Tests 6-17 compare MonthlyIncome across the levels of one grouping variable
# in `df` at a time. wilcox.test() is used for the OverTime comparison and
# kruskal.test() for all the others. The lines starting with `##` are the
# console output pasted from a previous run.
kruskal.test(MonthlyIncome ~ JobRole,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by JobRole
## Kruskal-Wallis chi-squared = 636.1, df = 8, p-value < 2.2e-16
#p-value < 2.2e-16


### 7. Monthly Income vs MaritalStatus
kruskal.test(MonthlyIncome ~ MaritalStatus,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by MaritalStatus
## Kruskal-Wallis chi-squared = 9.358, df = 2, p-value = 0.009288
#p-value = 0.009288


### 8. Monthly Income vs OverTime
wilcox.test(MonthlyIncome ~ OverTime,data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  MonthlyIncome by OverTime
## W = 79554, p-value = 0.6161
## alternative hypothesis: true location shift is not equal to 0
#p-value = 0.6161 (not significant at the 0.05 level)

### 9. Monthly Income vs Education
kruskal.test(MonthlyIncome ~ Education,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by Education
## Kruskal-Wallis chi-squared = 20.448, df = 4, p-value = 0.0004072
#p-value = 0.0004072

### 10. Monthly Income vs EnvironmentSatisfaction
kruskal.test(MonthlyIncome ~ EnvironmentSatisfaction,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by EnvironmentSatisfaction
## Kruskal-Wallis chi-squared = 1.4961, df = 3, p-value = 0.6832
#p-value = 0.6832 (not significant at the 0.05 level)

### 11. Monthly Income vs JobInvolvement
kruskal.test(MonthlyIncome ~ JobInvolvement,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by JobInvolvement
## Kruskal-Wallis chi-squared = 0.24444, df = 3, p-value = 0.9701
#p-value = 0.9701 (not significant at the 0.05 level)

### 12. Monthly Income vs JobLevel
kruskal.test(MonthlyIncome ~ JobLevel,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by JobLevel
## Kruskal-Wallis chi-squared = 744.02, df = 4, p-value < 2.2e-16
#p-value < 2.2e-16

### 13. Monthly Income vs JobSatisfaction
kruskal.test(MonthlyIncome ~ JobSatisfaction,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by JobSatisfaction
## Kruskal-Wallis chi-squared = 1.3648, df = 3, p-value = 0.7138
#p-value = 0.7138 (not significant at the 0.05 level)

### 14. Monthly Income vs RelationshipSatisfaction
kruskal.test(MonthlyIncome ~ RelationshipSatisfaction,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by RelationshipSatisfaction
## Kruskal-Wallis chi-squared = 1.4622, df = 3, p-value = 0.691
#p-value = 0.691 (not significant at the 0.05 level)

### 15. Monthly Income vs StockOptionLevel
kruskal.test(MonthlyIncome ~ StockOptionLevel,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by StockOptionLevel
## Kruskal-Wallis chi-squared = 8.8154, df = 3, p-value = 0.03185
#p-value = 0.03185

### 16. Monthly Income vs WorkLifeBalance
kruskal.test(MonthlyIncome ~ WorkLifeBalance,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by WorkLifeBalance
## Kruskal-Wallis chi-squared = 1.1367, df = 3, p-value = 0.7682
#p-value = 0.7682 (not significant at the 0.05 level)

### 17. Monthly Income vs NumCompaniesWorked
kruskal.test(MonthlyIncome ~ NumCompaniesWorked,data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by NumCompaniesWorked
## Kruskal-Wallis chi-squared = 54.478, df = 9, p-value = 1.531e-08
#p-value = 1.531e-08

# Median monthly income does not differ significantly across the levels of BusinessTravel,
# EnvironmentSatisfaction, OverTime, JobInvolvement, JobSatisfaction, RelationshipSatisfaction
# and WorkLifeBalance.


# Monthly income has a significant relationship with StockOptionLevel, JobLevel, Education,
# MaritalStatus, JobRole, Gender, EducationField, Department, Attrition and
# NumCompaniesWorked.

# Add some new variables to the datasets to increase the accuracy

# Append three tenure-ratio features to `data` and return the result:
#   year2 = YearsInCurrentRole / YearsAtCompany
#   year4 = YearsInCurrentRole / TotalWorkingYears
#   year5 = YearsAtCompany     / TotalWorkingYears
# Zero denominators are not guarded here: 0 / 0 yields NaN, which is.na()
# reports as missing.
add_tenure_ratios <- function(data) {
  data$year2 <- as.numeric(data$YearsInCurrentRole / data$YearsAtCompany)
  data$year4 <- as.numeric(data$YearsInCurrentRole / data$TotalWorkingYears)
  data$year5 <- as.numeric(data$YearsAtCompany / data$TotalWorkingYears)
  data
}

# The same features are needed on the training data and on both
# competition sets.
df <- add_tenure_ratios(df)
cs2.NoAttrition <- add_tenure_ratios(cs2.NoAttrition)
cs2.NoSalary <- add_tenure_ratios(cs2.NoSalary)

# Dealing with missing data in the new datasets

# Return `data` with the missing entries of every numeric column replaced by
# that column's median (computed with missing values removed). Non-numeric
# columns are left untouched.
fill_numeric_na_with_median <- function(data) {
  numeric_cols <- which(vapply(data, is.numeric, logical(1)))
  for (j in numeric_cols) {
    data[is.na(data[, j]), j] <- median(data[, j], na.rm = TRUE)
  }
  data
}

plot_missing(df)

sum(is.na(df))
## [1] 42
df <- fill_numeric_na_with_median(df)
sum(is.na(df))
## [1] 0
cs2.NoAttrition <- fill_numeric_na_with_median(cs2.NoAttrition)
sum(is.na(cs2.NoAttrition))
## [1] 0
cs2.NoSalary <- fill_numeric_na_with_median(cs2.NoSalary)
plot_missing(df)

sum(is.na(df))
## [1] 0

# Statistical analysis of the new variables:

# Two-sided Wilcoxon rank sum tests of the engineered ratios year2 and year4
# between the Attrition groups of `df`.
wilcox.test(year2 ~ Attrition, data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  year2 by Attrition
## W = 57437, p-value = 0.01949
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(year4 ~ Attrition, data = df, alternative = "two.sided")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  year4 by Attrition
## W = 58776, p-value = 0.004714
## alternative hypothesis: true location shift is not equal to 0
# Spearman rank correlation between year5 and MonthlyIncome.
# `alternative` is spelled out in full ("two.sided"); the previous value
# "two.side" only worked through partial matching of the argument value.
cor.test(df$year5, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
## 
##  Spearman's rank correlation rho
## 
## data:  df$year5 and df$MonthlyIncome
## S = 1.29e+08, p-value = 1.912e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1754371

# Define the most important variables to determine attrition

library(xgboost)
# Fit a single decision tree with Attrition as the response and every other
# column of `df` as a candidate predictor.
tree <- rpart(Attrition ~ ., data = df)
# Fit an RF
set.seed(101) # for reproducibility
rfo <- randomForest(Attrition ~ ., data = df, importance = TRUE)
# No GBM is fitted in this section. The seed call is kept only so that the
# random-number state seen by the code that follows is unchanged.
set.seed(102)

# Extract VI scores from each model
vi_tree <- tree$variable.importance
# randomForest objects have no `$variable.importance` element (the old code
# silently produced NULL); the scores are obtained with importance().
vi_rfo <- randomForest::importance(rfo)


# Load required packages
library(vip)
# Compute model-specific VI scores
vi(tree) # CART-like decision tree
## # A tibble: 26 × 2
##    Variable          Importance
##    <chr>                  <dbl>
##  1 MonthlyIncome          21.2 
##  2 OverTime               17.4 
##  3 DailyRate              10.5 
##  4 Age                     8.62
##  5 StockOptionLevel        8.34
##  6 JobRole                 6.97
##  7 TotalWorkingYears       6.60
##  8 JobSatisfaction         6.12
##  9 MaritalStatus           5.97
## 10 year4                   5.91
## # … with 16 more rows
vi(rfo) # RF
## # A tibble: 39 × 2
##    Variable          Importance
##    <chr>                  <dbl>
##  1 OverTime               18.3 
##  2 MonthlyIncome          11.5 
##  3 StockOptionLevel       11.5 
##  4 Age                    10.2 
##  5 JobRole                 9.03
##  6 MaritalStatus           8.71
##  7 YearsAtCompany          7.74
##  8 JobLevel                6.76
##  9 JobInvolvement          6.35
## 10 TotalWorkingYears       6.05
## # … with 29 more rows
# One importance plot per fitted model.
p1 <- vip(tree) + ggtitle("Single tree")

p2 <- vip(rfo) + ggtitle("Random forest")

# Display plots in a grid (Figure 1).
# Only p1 and p2 are built in this section; the former third argument `p3` has
# no model behind it here (no GBM is fitted), so it is no longer passed.
grid.arrange(p1, p2, nrow = 1)

# Create new datasets with the most important variables

# Predictor columns kept for the attrition models. The same list is applied to
# the training data and to the competition set without attrition labels.
attrition_predictors <- c(
  "OverTime", "JobSatisfaction", "JobInvolvement", "StockOptionLevel",
  "WorkLifeBalance", "EnvironmentSatisfaction", "Department", "JobRole",
  "NumCompaniesWorked", "MonthlyIncome", "DistanceFromHome",
  "YearsWithCurrManager", "YearsAtCompany", "YearsInCurrentRole",
  "TotalWorkingYears", "year2", "year4"
)

# Predictor columns kept for the salary models. The same list is applied to
# the training data and to the competition set without salaries.
salary_predictors <- c(
  "Age", "YearsSinceLastPromotion", "YearsWithCurrManager", "year5",
  "YearsAtCompany", "YearsInCurrentRole", "TotalWorkingYears",
  "StockOptionLevel", "JobLevel", "Education", "MaritalStatus", "JobRole",
  "Gender", "EducationField", "Department", "Attrition", "NumCompaniesWorked"
)

df_attrition <- df[, attrition_predictors]
cs2.NoAttrition <- cs2.NoAttrition[, attrition_predictors]

df_for_salary <- df[, salary_predictors]
cs2.NoSalary <- cs2.NoSalary[, salary_predictors]

#Converting factor variables to numeric variables(dummies)

# Expand every factor column into indicator columns. A separate encoder is
# built from (and applied to) each dataset, exactly as before.
# NOTE(review): because the competition sets get their own encoders, their
# columns only line up with the training columns if both contain the same
# factor levels -- confirm.
dmy <- dummyVars(" ~ .", data = df_attrition)
df_attrition1 <- data.frame(predict(dmy, newdata = df_attrition))

dmy1 <- dummyVars(" ~ .", data = cs2.NoAttrition)
cs2.NoAttrition_test <- data.frame(predict(dmy1, newdata = cs2.NoAttrition))

dmy2 <- dummyVars(" ~ .", data = df_for_salary)
df_for_salary1 <- data.frame(predict(dmy2, newdata = df_for_salary))

# StockOptionLevel is converted to a factor first so that it is expanded into
# indicator columns in the salary competition set.
cs2.NoSalary$StockOptionLevel <- as.factor(cs2.NoSalary$StockOptionLevel)
dmy3 <- dummyVars(" ~ .", data = cs2.NoSalary)
cs2.NoSalary_test <- data.frame(predict(dmy3, newdata = cs2.NoSalary))

# Re-attach the response of each task to its encoded predictors; the added
# column keeps its original name (Attrition / MonthlyIncome).
df_attrition2 <- cbind(df_attrition1, df["Attrition"])

df_for_salary2 <- cbind(df_for_salary1, df["MonthlyIncome"])

#Normalizing the numeric variables

library(tidyverse)

# Min-max scale a numeric vector to the interval [0, 1].
#
# x      Numeric vector to rescale.
# na.rm  If TRUE (default), missing values are ignored when the range is
#        computed and stay missing in the result. If FALSE, any missing value
#        makes the whole result NA, which is what the previous version did.
#
# Returns a numeric vector of the same length as `x`. A constant vector (zero
# range) is mapped to 0 instead of producing NaN through a division by zero.
normalize <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # All observed values are identical: return 0 for them, NA stays NA.
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

# Min-max scale `x` using the range of the reference vector `ref`.
rescale_with <- function(x, ref) {
  rng <- range(ref, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}

# Columns that are rescaled for each task.
attrition_scaled_cols <- c(
  "DistanceFromHome", "YearsWithCurrManager", "YearsAtCompany",
  "TotalWorkingYears", "YearsInCurrentRole", "MonthlyIncome"
)
salary_scaled_cols <- c(
  "Age", "YearsWithCurrManager", "YearsSinceLastPromotion",
  "YearsAtCompany", "year5", "TotalWorkingYears"
)

# The competition sets are scaled with the min/max of the TRAINING data, so a
# given raw value maps to the same scaled value in both datasets. Previously
# each competition set was scaled with its own min/max, which put its features
# on a different scale from the one the models are trained on. Values outside
# the training range now fall outside [0, 1]; that is expected.
# Within each iteration the competition column is scaled first, while the
# training column still holds its raw values.
# NOTE: prediction output pasted further down in this file was produced with
# the old per-dataset scaling and may differ after a re-run.
for (col in attrition_scaled_cols) {
  cs2.NoAttrition_test[[col]] <-
    rescale_with(cs2.NoAttrition_test[[col]], df_attrition2[[col]])
  df_attrition2[[col]] <- rescale_with(df_attrition2[[col]], df_attrition2[[col]])
}

for (col in salary_scaled_cols) {
  cs2.NoSalary_test[[col]] <-
    rescale_with(cs2.NoSalary_test[[col]], df_for_salary2[[col]])
  df_for_salary2[[col]] <- rescale_with(df_for_salary2[[col]], df_for_salary2[[col]])
}

#Dealing with the missing data

# Same median imputation as earlier in the script, applied again to df,
# cs2.NoAttrition and cs2.NoSalary: every missing value in a numeric column is
# replaced by that column's median.
plot_missing(df)

sum(is.na(df))
## [1] 0
for (j in which(vapply(df, is.numeric, logical(1)))) {
  df[is.na(df[, j]), j] <- median(df[, j], na.rm = TRUE)
}
sum(is.na(df))
## [1] 0
for (j in which(vapply(cs2.NoAttrition, is.numeric, logical(1)))) {
  cs2.NoAttrition[is.na(cs2.NoAttrition[, j]), j] <-
    median(cs2.NoAttrition[, j], na.rm = TRUE)
}
sum(is.na(cs2.NoAttrition))
## [1] 0
for (j in which(vapply(cs2.NoSalary, is.numeric, logical(1)))) {
  cs2.NoSalary[is.na(cs2.NoSalary[, j]), j] <-
    median(cs2.NoSalary[, j], na.rm = TRUE)
}
plot_missing(df)

sum(is.na(df))
## [1] 0

#Using naive bayes to predict attrition

# 75/25 random split of the encoded attrition data (seeded).
set.seed(1234)
n_obs <- nrow(df_attrition2)
sample_set <- sample(n_obs, round(n_obs * 0.75), replace = FALSE)
df_attrition2_train <- df_attrition2[sample_set, ]
df_attrition2_test <- df_attrition2[-sample_set, ]

# Naive Bayes fitted on the training part and scored on the held-out part.
naive <- naiveBayes(Attrition ~ ., data = df_attrition2_train)

pred <- predict(naive, df_attrition2_test, type = "class")

# Rows are the actual classes, columns the predicted classes.
pred_table <- table(df_attrition2_test$Attrition, pred)
pred_table
##      pred
##        No Yes
##   No  130  49
##   Yes   8  31
# Holdout accuracy.
sum(diag(pred_table)) / nrow(df_attrition2_test)
## [1] 0.7385321
# Here rows are predictions and columns the actual classes, so each rate is a
# diagonal cell divided by its column total (percentages, rounded).
tab.naive <- table(predicted = pred, Actual = df_attrition2_test$Attrition)
(sensitivity <- round(100 * tab.naive[2, 2] / sum(tab.naive[, 2])))
## [1] 79
(specificity <- round(100 * tab.naive[1, 1] / sum(tab.naive[, 1])))
## [1] 73
# Final classifier refitted on all labelled rows and scored on those same rows.
final.cls.model <- naiveBayes(Attrition ~ ., data = df_attrition2)
pred4 <- predict(final.cls.model, newdata = df_attrition2)

tab.naive <- table(predicted = pred4, Actual = df_attrition2$Attrition)
(sensitivity <- round(100 * tab.naive[2, 2] / sum(tab.naive[, 2])))
## [1] 80
(specificity <- round(100 * tab.naive[1, 1] / sum(tab.naive[, 1])))
## [1] 72
attrition_matrix <- confusionMatrix(pred4, df_attrition2$Attrition, positive = "Yes")
attrition_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  525  28
##        Yes 205 112
##                                           
##                Accuracy : 0.7322          
##                  95% CI : (0.7014, 0.7614)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3436          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.7192          
##          Pos Pred Value : 0.3533          
##          Neg Pred Value : 0.9494          
##              Prevalence : 0.1609          
##          Detection Rate : 0.1287          
##    Detection Prevalence : 0.3644          
##       Balanced Accuracy : 0.7596          
##                                           
##        'Positive' Class : Yes             
## 
# Metrics of the final naive Bayes model, all computed from `pred4` (its
# predictions on df_attrition2) against the labels of df_attrition2.
precision <- posPredValue(pred4, df_attrition2$Attrition, positive = "Yes")
precision
## [1] 0.3533123
# caret:: is spelled out because the results are stored in variables that
# carry the same names as the functions.
sensitivity <- caret::sensitivity(pred4, df_attrition2$Attrition, positive = "Yes")
sensitivity
## [1] 0.8
# Fixed: this used `pred` (the holdout predictions) against the labels of the
# full data set, two vectors of different length. With `pred4` the value agrees
# with the confusion matrix above (525 / (525 + 205)).
specificity <- caret::specificity(pred4, df_attrition2$Attrition, negative = "No")
specificity
## [1] 0.7191781
f1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
f1_score
## [1] 0.4901532
# Predicted attrition class for the competition set.
predfinal <- predict(final.cls.model, newdata = cs2.NoAttrition_test)
predfinal
##   [1] No  Yes Yes No  No  No  No  No  No  Yes Yes Yes No  No  No  No  No  No 
##  [19] No  No  Yes No  No  Yes Yes No  Yes Yes No  Yes Yes No  No  Yes Yes No 
##  [37] No  No  No  No  Yes Yes No  No  Yes No  No  No  Yes No  No  Yes No  No 
##  [55] No  No  No  Yes No  No  No  No  Yes No  No  Yes No  No  No  No  Yes No 
##  [73] No  No  No  Yes No  Yes No  Yes No  No  No  No  No  No  No  No  No  Yes
##  [91] No  No  Yes No  No  No  Yes No  Yes Yes No  No  No  No  Yes No  No  No 
## [109] Yes No  No  No  No  No  No  No  No  No  Yes No  No  No  No  No  No  No 
## [127] Yes No  No  No  Yes No  No  Yes No  No  No  Yes Yes Yes Yes No  No  Yes
## [145] No  Yes Yes Yes Yes No  No  Yes No  No  Yes Yes Yes Yes Yes No  No  No 
## [163] Yes No  No  Yes No  No  No  Yes No  Yes Yes Yes No  No  No  Yes No  No 
## [181] Yes No  No  No  No  No  No  Yes No  No  Yes No  No  No  Yes Yes Yes No 
## [199] No  No  No  No  Yes No  No  No  No  No  Yes No  Yes No  No  No  No  Yes
## [217] No  No  No  No  Yes No  No  No  No  No  Yes No  Yes No  Yes Yes No  Yes
## [235] No  No  Yes Yes No  No  Yes No  No  Yes No  No  Yes No  No  Yes Yes No 
## [253] No  Yes No  No  No  No  No  No  No  No  No  No  No  No  No  No  Yes No 
## [271] Yes No  No  Yes Yes Yes Yes No  Yes Yes No  Yes No  No  Yes Yes Yes Yes
## [289] Yes Yes No  No  No  No  Yes No  No  Yes No  No 
## Levels: No Yes
# Reload the raw competition file so that its ID column is available, then
# pair each ID with the predicted attrition class.
# NOTE(review): the absolute path only resolves on the original author's
# machine.
no_attrition_csv <- "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Attrition.csv"
cs2.NoAttrition <- read.csv(no_attrition_csv, stringsAsFactors = TRUE)
pred.df <- data.frame(ID = cs2.NoAttrition$ID, Attrition = predfinal)

#——————————— # Variable importance based on the naive Bayes model

# Single fixed tuning point for caret's "naive_bayes" method; no resampling is
# done (trainControl method "none"), the model is fitted once on all rows.
Grid <- data.frame(usekernel = TRUE, laplace = 0, adjust = 1)
mdl <- train(
  Attrition ~ .,
  data = df_attrition2,
  method = "naive_bayes",
  trControl = trainControl(method = "none"),
  tuneGrid = Grid
)
# Variable importance of the fitted model.
varImp(mdl)
## ROC curve variable importance
## 
##   only 20 most important variables shown (out of 43)
## 
##                                  Importance
## OverTimeNo                           100.00
## OverTimeYes                          100.00
## StockOptionLevel.0                    93.76
## MonthlyIncome                         93.29
## TotalWorkingYears                     93.05
## YearsAtCompany                        87.44
## YearsInCurrentRole                    83.39
## YearsWithCurrManager                  76.68
## StockOptionLevel.1                    76.15
## year4                                 44.21
## JobInvolvement.3                      39.26
## JobSatisfaction.4                     39.00
## JobRole.Sales.Representative          38.65
## DepartmentResearch...Development      38.56
## DepartmentSales                       37.62
## year2                                 36.34
## JobInvolvement.1                      36.00
## EnvironmentSatisfaction.1             35.71
## DistanceFromHome                      34.31
## JobRole.Manufacturing.Director        29.76
# Median imputation of numeric columns, repeated for df, the re-read
# cs2.NoAttrition, cs2.NoSalary and finally the encoded df_attrition2.
for (j in which(vapply(df, is.numeric, logical(1)))) {
  df[is.na(df[, j]), j] <- median(df[, j], na.rm = TRUE)
}
sum(is.na(df))
## [1] 0
for (j in which(vapply(cs2.NoAttrition, is.numeric, logical(1)))) {
  cs2.NoAttrition[is.na(cs2.NoAttrition[, j]), j] <-
    median(cs2.NoAttrition[, j], na.rm = TRUE)
}
sum(is.na(cs2.NoAttrition))
## [1] 0
for (j in which(vapply(cs2.NoSalary, is.numeric, logical(1)))) {
  cs2.NoSalary[is.na(cs2.NoSalary[, j]), j] <-
    median(cs2.NoSalary[, j], na.rm = TRUE)
}
plot_missing(df)

sum(is.na(df))
## [1] 0
sum(is.na(cs2.NoAttrition))
## [1] 0
for (j in which(vapply(df_attrition2, is.numeric, logical(1)))) {
  df_attrition2[is.na(df_attrition2[, j]), j] <-
    median(df_attrition2[, j], na.rm = TRUE)
}
sum(is.na(df_attrition2))
## [1] 0

#Use of GBM model to predict attrition ### GBM ###

library(gbm)
# Recode the response as 1 ("Yes") / 0 (anything else) for the bernoulli loss.
# This overwrites the Attrition column of df_attrition2 in place.
df_attrition2$Attrition <- as.numeric(df_attrition2$Attrition == "Yes")
gbm.mod_final <- gbm(
  formula = Attrition ~ .,
  data = df_attrition2,
  distribution = "bernoulli",
  n.trees = 5000,
  interaction.depth = 3,
  shrinkage = 0.1,
  n.minobsinnode = 10,
  cv.folds = 10
)
# Predicted probabilities on the training rows, cut at 0.5.
pred10 <- predict(gbm.mod_final, newdata = df_attrition2, type = "response")
pred10 <- as.numeric(pred10 > 0.5)
# Rows are predictions, columns the actual classes; each rate is a diagonal
# cell divided by its column total (percentages, rounded).
tab.gbm <- table(predicted = pred10, Actual = df_attrition2$Attrition)
(sensitivity <- round(100 * tab.gbm[2, 2] / sum(tab.gbm[, 2])))
## [1] 51
(specificity <- round(100 * tab.gbm[1, 1] / sum(tab.gbm[, 1])))
## [1] 99

#GBM had a specificity of 99%; however, the sensitivity was not high (51%)

#Use of KNN for prediction of attrition

#——————-KNN———————-

library(class)
# Seed the split (and knn()'s random tie-breaking) so a re-run gives the same
# result; the other sampling steps in this script are seeded as well.
# NOTE: the printed results kept below come from the original, unseeded run.
set.seed(1234)
idx <- sample.int(
  n = nrow(df_attrition2),
  size = floor(0.75 * nrow(df_attrition2)),
  replace = FALSE
)
train <- df_attrition2[idx, ]
test <- df_attrition2[-idx, ]

# Separate the target from the predictors by column name rather than by a
# hard-coded position (previously column 44).
is_target <- names(df_attrition2) == "Attrition"
trn_target <- train$Attrition
trn <- train[, !is_target, drop = FALSE]
tst_target <- test$Attrition
tst <- test[, !is_target, drop = FALSE]

# Reference fit with k = 6: rows are actual classes, columns predictions.
pred <- knn(train = trn, test = tst, cl = trn_target, k = 6)
model_table <- table(tst_target, pred)
model_table
##           pred
## tst_target   0   1
##          0 179   5
##          1  29   5
sum(diag(model_table)) / nrow(tst)
## [1] 0.8440367

# Holdout metrics for k = 1..50, stored at index k (preallocated).
k_values <- 1:50
Accuracy <- numeric(length(k_values))
mis <- numeric(length(k_values))
sen <- numeric(length(k_values))
spe <- numeric(length(k_values))

for (i in k_values) {
  pred <- knn(train = trn, test = tst, cl = trn_target, k = i)
  # Rows are predictions, columns the actual classes.
  tab <- table(Predicted = pred, Real = tst_target)
  Accuracy[i] <- ((tab[1, 1] + tab[2, 2]) / sum(tab)) * 100
  mis[i] <- round((tab[1, 2] + tab[2, 1]) / sum(tab), 2)
  sen[i] <- round(tab[2, 2] / (tab[2, 2] + tab[1, 2]), 2)
  spe[i] <- round(tab[1, 1] / (tab[1, 1] + tab[2, 1]), 2)
}
plot(x = k_values, y = Accuracy, xlab = "k", pch = 19, type = "b")
abline(v = which.max(Accuracy), col = "red", lwd = 2)

# Summary for k = 6, the value used for the reference fit above.
data.frame(Measure = c("Accuracy","Misclassification Rate","Sensitivity","Specificity"),
           Value = c(round(Accuracy[6], 2), round(mis[6], 2), round(sen[6], 2), round(spe[6], 2)))
##                  Measure Value
## 1               Accuracy 84.40
## 2 Misclassification Rate  0.16
## 3            Sensitivity  0.15
## 4            Specificity  0.97
# NOTE(review): cs2.NoAttrition_test is built from predictor columns only, so
# this presumably assigns NULL -- confirm whether the line is still needed.
Attrition <- cs2.NoAttrition_test$Attrition

#———————————–

# Median imputation of the numeric columns of cs2.NoSalary.
for (j in which(vapply(cs2.NoSalary, is.numeric, logical(1)))) {
  cs2.NoSalary[is.na(cs2.NoSalary[, j]), j] <-
    median(cs2.NoSalary[, j], na.rm = TRUE)
}

# Seeded 70/30 split of the salary data.
set.seed(2021)
n_salary <- nrow(df_for_salary2)
train_ind <- sample(seq_len(n_salary), size = floor(0.7 * n_salary))
train <- df_for_salary2[train_ind, ]
test <- df_for_salary2[-train_ind, ]

# Linear regression of MonthlyIncome on all remaining columns, fitted on the
# training part only.
model2 <- lm(MonthlyIncome ~ ., data = train)
summary(model2)
## 
## Call:
## lm(formula = MonthlyIncome ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2868.1  -647.5   -99.7   627.6  4304.6 
## 
## Coefficients: (9 not defined because of singularities)
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        13341.88     638.10  20.909  < 2e-16 ***
## Age                                 -268.64     276.89  -0.970   0.3324    
## YearsSinceLastPromotion              436.68     267.49   1.632   0.1031    
## YearsWithCurrManager                  33.59     329.79   0.102   0.9189    
## year5                               -585.45     287.94  -2.033   0.0425 *  
## YearsAtCompany                       424.58     745.11   0.570   0.5690    
## YearsInCurrentRole                    27.52      20.53   1.340   0.1806    
## TotalWorkingYears                   1135.04     645.83   1.757   0.0794 .  
## StockOptionLevel.0                  -114.09     208.17  -0.548   0.5839    
## StockOptionLevel.1                    40.28     172.88   0.233   0.8159    
## StockOptionLevel.2                   -12.94     210.30  -0.062   0.9510    
## StockOptionLevel.3                       NA         NA      NA       NA    
## JobLevel.1                        -10678.14     413.78 -25.806  < 2e-16 ***
## JobLevel.2                         -9266.65     354.70 -26.125  < 2e-16 ***
## JobLevel.3                         -6081.86     323.01 -18.828  < 2e-16 ***
## JobLevel.4                         -2660.55     276.21  -9.632  < 2e-16 ***
## JobLevel.5                               NA         NA      NA       NA    
## Education.1                          178.76     270.95   0.660   0.5097    
## Education.2                          201.01     251.16   0.800   0.4239    
## Education.3                          171.50     243.11   0.705   0.4808    
## Education.4                          294.01     247.16   1.190   0.2347    
## Education.5                              NA         NA      NA       NA    
## MaritalStatusDivorced               -304.53     181.77  -1.675   0.0944 .  
## MaritalStatusMarried                 -37.56     145.48  -0.258   0.7963    
## MaritalStatusSingle                      NA         NA      NA       NA    
## JobRole.Healthcare.Representative    729.54     472.25   1.545   0.1229    
## JobRole.Human.Resources              178.55     610.32   0.293   0.7700    
## JobRole.Laboratory.Technician       -662.95     435.08  -1.524   0.1281    
## JobRole.Manager                     4632.79     425.54  10.887  < 2e-16 ***
## JobRole.Manufacturing.Director       913.72     469.55   1.946   0.0522 .  
## JobRole.Research.Director           4446.30     505.61   8.794  < 2e-16 ***
## JobRole.Research.Scientist          -462.98     435.64  -1.063   0.2883    
## JobRole.Sales.Executive             1490.50     254.09   5.866 7.57e-09 ***
## JobRole.Sales.Representative             NA         NA      NA       NA    
## GenderFemale                         -35.47      87.02  -0.408   0.6837    
## GenderMale                               NA         NA      NA       NA    
## EducationFieldHuman.Resources        -63.83     452.84  -0.141   0.8880    
## EducationFieldLife.Sciences          167.75     157.06   1.068   0.2859    
## EducationFieldMarketing               95.59     204.94   0.466   0.6411    
## EducationFieldMedical                -23.85     163.47  -0.146   0.8841    
## EducationFieldOther                   36.61     221.01   0.166   0.8685    
## EducationFieldTechnical.Degree           NA         NA      NA       NA    
## DepartmentHuman.Resources            278.58     572.14   0.487   0.6265    
## DepartmentResearch...Development     693.29     390.12   1.777   0.0761 .  
## DepartmentSales                          NA         NA      NA       NA    
## Attrition.No                          16.61     122.87   0.135   0.8925    
## Attrition.Yes                            NA         NA      NA       NA    
## NumCompaniesWorked                   -12.93      20.79  -0.622   0.5342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1016 on 570 degrees of freedom
## Multiple R-squared:  0.9553, Adjusted R-squared:  0.9524 
## F-statistic: 320.9 on 38 and 570 DF,  p-value: < 2.2e-16
# Root mean squared error of model2 on the held-out rows.
pred1 <- predict(model2, newdata = test)
holdout_error <- pred1 - test$MonthlyIncome
(RMSE.test <- sqrt(mean(holdout_error^2)))
## [1] 1032.488
# Final regression refitted on all labelled rows.
final.reg.model <- lm(MonthlyIncome ~ ., data = df_for_salary2)
summary(final.reg.model)
## 
## Call:
## lm(formula = MonthlyIncome ~ ., data = df_for_salary2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3156.3  -650.8   -81.5   590.5  4268.7 
## 
## Coefficients: (9 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        13586.419    540.253  25.148  < 2e-16 ***
## Age                                 -137.647    227.992  -0.604   0.5462    
## YearsSinceLastPromotion              176.497    219.869   0.803   0.4224    
## YearsWithCurrManager                 -60.752    278.907  -0.218   0.8276    
## year5                               -585.410    239.165  -2.448   0.0146 *  
## YearsAtCompany                       538.091    630.979   0.853   0.3940    
## YearsInCurrentRole                    27.348     16.480   1.660   0.0974 .  
## TotalWorkingYears                    939.857    552.371   1.701   0.0892 .  
## StockOptionLevel.0                   -34.970    177.729  -0.197   0.8441    
## StockOptionLevel.1                    67.901    151.455   0.448   0.6540    
## StockOptionLevel.2                   -18.330    181.820  -0.101   0.9197    
## StockOptionLevel.3                        NA         NA      NA       NA    
## JobLevel.1                        -11179.803    343.257 -32.570  < 2e-16 ***
## JobLevel.2                         -9461.699    295.101 -32.063  < 2e-16 ***
## JobLevel.3                         -6205.247    267.645 -23.185  < 2e-16 ***
## JobLevel.4                         -2729.978    227.777 -11.985  < 2e-16 ***
## JobLevel.5                                NA         NA      NA       NA    
## Education.1                          500.210    233.325   2.144   0.0323 *  
## Education.2                          430.953    219.298   1.965   0.0497 *  
## Education.3                          379.028    213.130   1.778   0.0757 .  
## Education.4                          540.030    215.779   2.503   0.0125 *  
## Education.5                               NA         NA      NA       NA    
## MaritalStatusDivorced               -107.825    149.447  -0.721   0.4708    
## MaritalStatusMarried                 -23.053    117.309  -0.197   0.8443    
## MaritalStatusSingle                       NA         NA      NA       NA    
## JobRole.Healthcare.Representative    712.045    383.363   1.857   0.0636 .  
## JobRole.Human.Resources             -128.198    501.019  -0.256   0.7981    
## JobRole.Laboratory.Technician       -488.433    353.682  -1.381   0.1677    
## JobRole.Manager                     4242.045    341.116  12.436  < 2e-16 ***
## JobRole.Manufacturing.Director       840.370    381.598   2.202   0.0279 *  
## JobRole.Research.Director           4247.095    415.238  10.228  < 2e-16 ***
## JobRole.Research.Scientist          -270.775    352.637  -0.768   0.4428    
## JobRole.Sales.Executive             1184.899    205.099   5.777 1.07e-08 ***
## JobRole.Sales.Representative              NA         NA      NA       NA    
## GenderFemale                         -85.110     71.102  -1.197   0.2316    
## GenderMale                                NA         NA      NA       NA    
## EducationFieldHuman.Resources          9.144    366.428   0.025   0.9801    
## EducationFieldLife.Sciences           84.397    129.844   0.650   0.5159    
## EducationFieldMarketing               49.249    171.111   0.288   0.7736    
## EducationFieldMedical                  8.696    134.686   0.065   0.9485    
## EducationFieldOther                   16.237    186.473   0.087   0.9306    
## EducationFieldTechnical.Degree            NA         NA      NA       NA    
## DepartmentHuman.Resources            328.549    472.078   0.696   0.4866    
## DepartmentResearch...Development     491.587    315.076   1.560   0.1191    
## DepartmentSales                           NA         NA      NA       NA    
## Attrition.No                          39.908    102.509   0.389   0.6971    
## Attrition.Yes                             NA         NA      NA       NA    
## NumCompaniesWorked                    -7.074     17.501  -0.404   0.6862    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1008 on 831 degrees of freedom
## Multiple R-squared:  0.954,  Adjusted R-squared:  0.9519 
## F-statistic:   454 on 38 and 831 DF,  p-value: < 2.2e-16
# Root mean squared error of the final model on the rows it was fitted on.
pred1 <- predict(final.reg.model, newdata = df_for_salary2)
training_error <- pred1 - df_for_salary2$MonthlyIncome
(RMSE.tr.reg <- sqrt(mean(training_error^2)))
## [1] 985.0243

# Let's do some data preprocessing

# Treat StockOptionLevel as a factor in the raw salary competition set.
cs2.NoSalary[["StockOptionLevel"]] <- as.factor(cs2.NoSalary[["StockOptionLevel"]])
# Add a MonthlyIncome column filled with NaN to the encoded competition set.
cs2.NoSalary_test[["MonthlyIncome"]] <- NaN

# Let's do some data preprocessing

# Rename the indicator columns of cs2.NoSalary_test from "<Variable>.<Level>"
# to "<Variable><Level>" (e.g. "Gender.Female" -> "GenderFemale"), which is the
# naming the regression summaries above show for these variables.
# This replaces a long run of one-rename-per-line statements, most of them
# repeated several times, with a single lookup. Only the names listed here are
# touched and names that are absent are skipped, exactly as before.
dotted_names <- c(
  "MaritalStatus.Divorced", "MaritalStatus.Married", "MaritalStatus.Single",
  "Gender.Female", "Gender.Male",
  "EducationField.Human.Resources", "EducationField.Life.Sciences",
  "EducationField.Marketing", "EducationField.Medical",
  "EducationField.Other", "EducationField.Technical.Degree",
  "Department.Human.Resources", "Department.Research...Development",
  "Department.Sales",
  "OverTime.No", "OverTime.Yes"
)
# Drop only the FIRST dot (the separator between variable and level); dots
# inside the level name are kept.
undotted_names <- sub(".", "", dotted_names, fixed = TRUE)

rename_idx <- match(names(cs2.NoSalary_test), dotted_names)
to_rename <- !is.na(rename_idx)
names(cs2.NoSalary_test)[to_rename] <- undotted_names[rename_idx[to_rename]]
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "Gender.Male"] <- "GenderMale"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "MaritalStatus.Divorced"] <- "MaritalStatusDivorced"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "MaritalStatus.Married"] <- "MaritalStatusMarried"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "MaritalStatus.Single"] <- "MaritalStatusSingle"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "OverTime.No"] <- "OverTimeNo"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "OverTime.Yes"] <- "OverTimeYes"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "Department.Human.Resources"] <- "DepartmentHuman.Resources"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "Department.Research...Development"] <- "DepartmentResearch...Development"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "Department.Sales"] <- "DepartmentSales"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Human.Resources"] <- "EducationFieldHuman.Resources"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Life.Sciences"] <- "EducationFieldLife.Sciences"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Human.Resources"] <- "EducationFieldHuman.Resources"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Marketing"] <- "EducationFieldMarketing"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Medical"] <- "EducationFieldMedical"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Other"] <- "EducationFieldOther"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "EducationField.Technical.Degree"] <- "EducationFieldTechnical.Degree"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "Gender.Female"] <- "GenderFemale"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "Gender.Male"] <- "GenderMale"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "MaritalStatus.Divorced"] <- "MaritalStatusDivorced"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "MaritalStatus.Married"] <- "MaritalStatusMarried"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "MaritalStatus.Single"] <- "MaritalStatusSingle"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "OverTime.No"] <- "OverTimeNo"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "OverTime.Yes"] <- "OverTimeYes"
names(cs2.NoSalary_test)[names(cs2.NoSalary_test) == "OverTime.Yes"] <- "OverTimeYes"
# Read the "No Salary" competition file; its ID column is used below to label
# the predictions. String columns are read as factors.
cs2.NoSalary1 <- read.csv(
  "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Salary.csv",
  stringsAsFactors = TRUE
)

# Score the prepared test frame with final.reg.model.
pred2 <- predict(final.reg.model, newdata = cs2.NoSalary_test)

# Pair each ID from the raw file with its predicted MonthlyIncome.
# NOTE(review): this assumes cs2.NoSalary_test still has the same rows, in the
# same order, as the raw file -- confirm no rows were dropped or reordered.
pred.df <- data.frame(ID = cs2.NoSalary1$ID, MonthlyIncome = pred2)